Load the data set you exported in the final Task of Case Study 2.

library(readr)
library(tidyr)
library(dplyr)
## 
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
## 
##     filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
library(data.table)
## 
## Attache Paket: 'data.table'
## Die folgenden Objekte sind maskiert von 'package:dplyr':
## 
##     between, first, last
library(knitr)
require(stringr)
## Lade nötiges Paket: stringr
library(ggplot2)
library(plotly)
## 
## Attache Paket: 'plotly'
## Das folgende Objekt ist maskiert 'package:ggplot2':
## 
##     last_plot
## Das folgende Objekt ist maskiert 'package:stats':
## 
##     filter
## Das folgende Objekt ist maskiert 'package:graphics':
## 
##     layout
library(htmltools)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(installr)
## 
## Welcome to installr version 0.23.4
## 
## More information is available on the installr project website:
## https://github.com/talgalili/installr/
## 
## Contact: <tal.galili@gmail.com>
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/installr/issues
## 
##          To suppress this message use:
##          suppressPackageStartupMessages(library(installr))
library(scales)
## 
## Attache Paket: 'scales'
## Das folgende Objekt ist maskiert 'package:readr':
## 
##     col_factor
library(rworldmap)
## Lade nötiges Paket: sp
## ### Welcome to rworldmap ###
## For a short introduction type :   vignette('rworldmap')

In our csv the missing values are represented by dot character “.”, so we will specify it when parsing.

data <- read.csv("final_data.csv", sep = ";", stringsAsFactors = FALSE, na.strings = ".")

data <- data %>%
  mutate(across(where(is.character), trimws))

print(head(data))
##          ISO                   Country continent       sub.region Income_class
## 1 MC|MCO|492                    Monaco    Europe   Western Europe            H
## 2 JP|JPN|392                     Japan      Asia     Eastern Asia            H
## 3 PM|SPM|666 Saint Pierre and Miquelon  Americas Northern America         <NA>
## 4 DE|DEU|276                   Germany    Europe   Western Europe            H
## 5 IT|ITA|380                     Italy    Europe  Southern Europe            H
## 6 AD|AND|020                   Andorra    Europe  Southern Europe            H
##   Median_Age Youth_unemployment_rate Migration_rate
## 1       55.4                    26.6            8.3
## 2       48.6                     3.6            0.0
## 3       48.5                      NA           -7.7
## 4       47.8                     6.2            1.5
## 5       46.5                    32.2            3.2
## 6       46.2                      NA            0.0
print(paste("Number of observations:", nrow(data)))
## [1] "Number of observations: 227"

Eliminate all observations with missing values in the income status variable.

clean_data <- data %>%
  filter(!is.na(Income_class))


print(head(clean_data))
##          ISO Country continent      sub.region Income_class Median_Age
## 1 MC|MCO|492  Monaco    Europe  Western Europe            H       55.4
## 2 JP|JPN|392   Japan      Asia    Eastern Asia            H       48.6
## 3 DE|DEU|276 Germany    Europe  Western Europe            H       47.8
## 4 IT|ITA|380   Italy    Europe Southern Europe            H       46.5
## 5 AD|AND|020 Andorra    Europe Southern Europe            H       46.2
## 6 GR|GRC|300  Greece    Europe Southern Europe            H       45.3
##   Youth_unemployment_rate Migration_rate
## 1                    26.6            8.3
## 2                     3.6            0.0
## 3                     6.2            1.5
## 4                    32.2            3.2
## 5                      NA            0.0
## 6                    39.9            0.9
print(paste("Number of observations:", nrow(clean_data)))
## [1] "Number of observations: 184"

Also, we had a problem with missing ISO codes for Macedonia and Turkey in our exported in case study 2 data set, but these 2 countries are already eliminated from analyses, since they had missing values in the income status variable.

a. Median age in different income levels

clean_data <- clean_data %>%
  mutate(Income_class = factor(Income_class, levels = c("H", "UM", "LM", "L")))

density_plot <- ggplot(clean_data, aes(x = Median_Age, fill = Income_class)) +
  geom_density(alpha = 0.5, color = "black") +
  scale_fill_manual(values = c("H" = "red", "UM" = "green", "LM" = "blue", "L" ="orange")) +
  labs(x = "Median age of population", fill = element_blank()) +
  theme(legend.position = "top", legend.title = element_blank())

print(density_plot)

The plot demonstrates that the low-income countries have the smallest spread of median age of the population, while countries of other income statutes have the wider spread. Also, high level income countries tend to have the highest median age among represented categories, indicating an older population, while the opposite situation is observed for countries of low income. Overall, we can spot a tendency that the higher income level country has, the higher median age of population it owns.

b. Income status in different continents

Using ggplot2, create a stacked barplot of absolute frequencies showing how the entities are split into continents and income status.

clean_data_present_continent <- clean_data[!is.na(clean_data$continent), ]

absolute_freq_barplot <- ggplot(clean_data_present_continent, aes(x = continent, 
                      fill = Income_class)) + geom_bar(position = "stack") +
  labs(x = "Continent", y = "Number of Countries", fill = "Income Class") +
                                                          theme_minimal()

print(absolute_freq_barplot)

We can see that Africa has the greatest number of low-income countries and the lowest proportion of high-income countries among all other continents. Additionally to Africa, only Asia has one low-income country represented. Europe has the highest number of high-income countries and only one lower-middle income country. Oceania has equal number of lower- and upper- middle countries (10 each) but the highest number (17) is of high-income countries. Prevailing countries in Amerias is upper-middle income countries.

Create another stacked barplot of relative frequencies (height of the bars should be one).

relative_freq_plot <- ggplot(clean_data_present_continent, aes(x = continent, fill = Income_class)) +
  geom_bar(position = "fill") +
  labs(x = "Continent", y = "Proportion of Countries", fill = "Income Class") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal()

print(relative_freq_plot)

This plot gives us a better understanding of proportion of each income class countries for each continent. Now it is easier to determine that for example over 75% countries in Europe are high-income countries, while in Asia - around 26%

Create a mosaic plot of continents and income status using base R functions.

mosaicplot(table(clean_data_present_continent$continent, 
                 clean_data_present_continent$Income_class),
           main = "Mosaic Plot of Continents and Income Status",
           xlab = "Continent",
           ylab = "Income Class")

Overall, each plot offers a different perspective on the data, making it easier to understand the distribution of income statuses across continents. The barplot of absolute frequencies provides absolute viewpoints, allowing to detect the exact number of countries on some continent in specific category. The barplot of relative frequencies provides relative viewpoints, allowing to detect the proportion of countries of some status on the continent. The mosaic plot adds an additional layer of understanding by visualizing the relationship between the two categorical variables ().

c. Income status in different subcontinents

asia_data <- clean_data_present_continent[clean_data_present_continent$continent 
                                          == "Asia", ]

relative_freq_asia <- prop.table(table(asia_data$sub.region, 
                                       asia_data$Income_class), margin = 1)

relative_freq_df_asia <- as.data.frame(relative_freq_asia)
names(relative_freq_df_asia) <- c("Subcontinent", "Income_class", "Proportion")


relative_freq_plot_asia <- ggplot(relative_freq_df_asia, aes(x = Subcontinent, 
                                      y = Proportion, fill = Income_class)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(x = "Subcontinent", y = "Proportion of Countries", fill = "Income Class")+
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(relative_freq_plot_asia)

Plot demonstrates that the highest proportion of high-income countries is in Western Asia, while there are no high-income countries in Southern and Central Asia at all. The prevailing countries in Southern Asia are of lower-middle income level. In Central Asia there is an even distribution between lower- and upper- middle income countries.

d. Net migration in different continents

boxplot_migration <- ggplot(clean_data_present_continent, aes(x = continent,
                      y=Migration_rate, fill = continent)) +  geom_boxplot() +
  labs(x = "Continent", y = "Net Migration Rate", fill = "Continent") + 
  theme_minimal()

asia_largest_negative_outlier <- asia_data[which.max(asia_data$Migration_rate),
                                           "Country"]
asia_largest_positive_outlier <- asia_data[which.min(asia_data$Migration_rate), 
                                           "Country"]

print(boxplot_migration)

cat("Largest Negative Outlier in Asia:", asia_largest_negative_outlier, "\n")
## Largest Negative Outlier in Asia: Singapore
cat("Largest Positive Outlier in Asia:", asia_largest_positive_outlier, "\n")
## Largest Positive Outlier in Asia: Lebanon

We can see that Asia has some severe outliers, which were detected (Singapore and Lebanon), while other countries also have outlying values but not to such high extent as Asia. All continents have more countries with negative migration rate except Europe where number of countries with positive and negative migration rates is balanced. Prevailing number of countries in Oceania have negative migration rate.

e. Net migration in different subcontinents

subcontinent_migration <- clean_data_present_continent %>%
  filter(!is.na(sub.region)) %>%
  filter(!is.na(Migration_rate))


migration_boxplot <- ggplot(subcontinent_migration, aes(x = sub.region, 
  y = Migration_rate, fill = continent)) + geom_boxplot() +
  facet_grid(~continent, scales = "free_x") + 
  labs(x = "Subcontinent", y = "Net Migration Rate", fill = "Continent") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  

print(migration_boxplot)

Now we see more information about which regions on the continent experience prevailing negative/positive migration rate. For example, now we can clearer detect that Micronesia and Polynesia in majority have countries with negative migration rate, and that migration rate in Northern America is in general higher than the in Latin America and the Caribbean.

f. Median net migration rate per subcontinent

library(forcats)

median_migration <- clean_data_present_continent %>%
  group_by(sub.region) %>%
  summarise(median_migration_rate = median(Migration_rate, na.rm = TRUE),
            continent = first(continent)) %>%
  ungroup() %>%
  mutate(sub.region = fct_reorder(sub.region, median_migration_rate))

median_migration_plot <- ggplot(median_migration, aes(x = median_migration_rate,
                                          y = sub.region, color = continent)) +
  geom_point() +
  labs(x = "Median Net Migration Rate", y = "Sub-Region", color = "Continent") +
  scale_color_brewer(palette = "Set1") +  
  theme_minimal() +
  theme(legend.position = "top")

print(median_migration_plot)

The plot shows us that majority of regions have median net migration rate between -2 and 0. The most negative median migration rate is possessed by Polynesia and Micronesia subregions, while the most positive by Australia and New Zealand. Western and Northern Europe also has relatively high positive median migration rate.

g. Median youth unemployment rate per subcontinent

median_unemployment <- clean_data_present_continent %>%
  group_by(sub.region) %>%
  summarise(median_unemployment_rate = median(Youth_unemployment_rate, na.rm = TRUE),
            continent = first(continent)) %>%
  ungroup() %>%
  mutate(sub.region = fct_reorder(sub.region, median_unemployment_rate))

median_unemployment_barplot <- ggplot(median_unemployment, aes(x = median_unemployment_rate,
  y = sub.region, fill = continent)) +
  geom_bar(stat = "identity", alpha = 0.7) +
  labs(x = "Median Youth Unemployment Rate", y = "Sub-Region", fill = "Continent") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw() +
  theme(legend.position = "top")

print(median_unemployment_barplot)

We can see that Northern Africa has significantly the highest youth unemployment rate, followed by Polynesia, in Oceania and Southern Europe. Central Asia and South-eastern Asia have the lowest rates. Moreover, we can also notice variation within the continents. Oceania and Europe show a broader range or the median youth unemployment rates.

h. Median youth unemployment rate per subcontinent – with error bars

iqr_unemployment <- clean_data_present_continent %>%
  group_by(sub.region) %>%
  summarise(
    Q1 = quantile(Youth_unemployment_rate, 0.25, na.rm = TRUE),
    Q3 = quantile(Youth_unemployment_rate, 0.75, na.rm = TRUE)
  )

median_iqr_unemployment <- merge(median_unemployment, iqr_unemployment, by = "sub.region")

ggplot(median_iqr_unemployment, aes(x = median_unemployment_rate, y = sub.region, color = continent)) +
  geom_point() +
  geom_errorbar(aes(ymin = Q1, ymax = Q3), width = 0.2) +
  labs(x = "Median Youth Unemployment Rate", y = "Sub-Region", color = "Continent") +
  scale_color_brewer(palette = "Set1") +
  theme_minimal() +
  theme(legend.position = "top")

Sub-regions with shorter bars, like Northern Europe or Central Asia, have more consistency in the rates, while sub-regions with longer bars, such as Western Asia or Sub-Saharan Africa, indicate a wider variation in the youth unemployment rates. We can also, see some trends, for instance, European sub-regions have generally higher rates, while sub-regions in Oceania show high variability in the unemployment rates.

i. Relationship between median age and net migration rate

ggplot(clean_data, aes(x = Median_Age, y = Migration_rate, color = Income_class)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Median Age", y = "Net Migration Rate", color = "Income Status") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Based on the plot, it seems that the high-income countries have a positive correlation between the median age and net migration rate. There also appear to be some variability for the rest of the categories when it comes to the net migration rates. In general, we see that as the median age increases, the net migration rate also tends to increase, particularly in the high income category, suggesting that in countries with older population, having more migrants coming is needed for the workforce.

j. Relationship between youth unemployment and net migration rate

unemployment_migration_plot <- ggplot(clean_data_present_continent, 
                                      aes(x = Youth_unemployment_rate, 
                                          y = Migration_rate, 
                                          color = continent)) +
  geom_point(size = 3, alpha = 0.7) +  
  geom_smooth(method = "lm", se = FALSE) +  
  labs(x = "Youth Unemployment Rate", 
       y = "Net Migration Rate", 
       color = "Continent") +
  scale_color_brewer(palette = "Set1") +  
  theme_minimal() +
  theme(legend.position = "top")

# Print the plot
print(unemployment_migration_plot)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 26 rows containing missing values or values outside the scale range
## (`geom_point()`).

k. Merging population data

Data got from https://databank.worldbank.org/source/population-estimates-and-projections#

population_data <- read.csv("population_data.csv")

population_data <- population_data %>%
  select(Country.Code, Population = X2020..YR2020.)

head(population_data)
clean_data_present_continent <- clean_data_present_continent %>%
  mutate(ISO_Code_3 = str_extract(ISO, "\\|([A-Z]{3})\\|") %>%
           str_remove_all("\\|"))

head(clean_data_present_continent)
merged_data <- clean_data_present_continent %>%
  left_join(population_data, by = c("ISO_Code_3" = "Country.Code"))

merged_data$Population <- as.numeric(as.character(merged_data$Population))
head(merged_data)

l. Scatterplot of median age and net migration rate in Europe

europe_data <- merged_data %>% 
               filter(continent == "Europe")
head(europe_data)
scatterplot_age_umeployment <- ggplot(europe_data, aes(x = Median_Age, y = Migration_rate, size = Population)) + 
  geom_point(alpha = 0.7) + 
  labs(
    title="Scatterplot Median Age and Migration Rate Europe",
    x = "Median Age",
    y = "Migration Rate"
  ) +
  theme(legend.position = "none")

print(scatterplot_age_umeployment)

Without a legend we can’t make statements about certain countries. But we can see that for most European countries the median age is between 40 and 47. These countries also mostly have a migration rate of 0 to 5 percent. That means that for most European countries migration has a small positive or no effect on the population. Within this group are also the most populous European countries. The outliers are mostly countries with a tiny population.

m. Interactive plot

world_plot_l <- ggplot(merged_data, aes(x = Median_Age, y = Migration_rate, size = Population, color = as.factor(continent), text = Country)) + 
  geom_point(alpha = 0.7) + 
  labs(
    title = "Scatterplot of Median Age and Migration Rate for all Countries",
    x = "Median Age",
    y = "Migration Rate",
    size = "Population",
    color = "continent",
  ) +
  theme(legend.position = "none")

interactive_plot_l <- ggplotly(world_plot_l, tooltip = c("text", "x", "y", "size"))
interactive_plot_l

n. Parallel coordinate plot

ggparcoord(merged_data, 
           columns = c("Median_Age", "Youth_unemployment_rate", "Migration_rate"),
           groupColumn = "Income_class",
           order = 6:8,
           showPoints = TRUE,
           scale = "globalminmax",
           title = "Parallel Coordinate Plot of all Countries",
           alphaLines = 0.5)

In the creation of the plot we tried different scalings to give a better view on these many datapoints. To also keep the values make sense, we decided on “globalminmax”. We can make statements about the data by Income class. Countries with high income tend to have rather low youth unempolment (however with some considerable outliers) and a rather high migration rate. Countries with upper middle, lower middle and low income tend to not have a trend for youth unemployment, for migration rate these countries make up the lower half of values, to rates from lower one digit numbers to negative values.

o. World map visualisation

data(countryExData)

merged_o <- countryExData %>%
  left_join(merged_data, by = c("ISO3V10" = "ISO_Code_3"))

sPDF <-joinCountryData2Map(merged_o)
## 149 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 94 codes from the map weren't represented in your data
par(mai=c(0,0,0.2,0),xaxs="i",yaxs="i")
mapCountryData(sPDF, nameColumnToPlot="Median_Age", mapTitle="Median Age by Country", catMethod="fixedWidth", missingCountryCol="gray50")